{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "253e89df-7f99-4455-acec-ae38d4d6f098",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Forward slashes resolve on Windows, Linux and macOS alike and avoid the invalid\n",
    "# backslash escape sequences (\\d, \\w, \\o) that the previous literals contained.\n",
    "file_path = \"../data/wzq-medical-cases.xlsx\" # The path of original medical cases\n",
    "output_file = \"../data/output.xlsx\" # Define the path of the output Excel file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb5e03c5-9c2e-4ac5-9d84-b12dc65611fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the workbook of original medical cases and keep the column holding the case text.\n",
    "df = pd.read_excel(io=file_path)\n",
    "texts = df.loc[:, 'Original medical cases']\n",
    "\n",
    "# Preview the first ten cases.\n",
    "print(texts.head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e05e6f82-9166-4735-aab5-8ffa09cf2787",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract prompts required for different entities\n",
    "class ExtractEntityPrompt:\n",
    "    \"\"\"Few-shot system prompts (in Chinese) for extracting entities from medical case text.\n",
    "\n",
    "    Each method returns one prompt: a task instruction followed by three Input/Output\n",
    "    examples. Example lists use straight quotes and ASCII commas between items in all\n",
    "    three prompts, so every prompt demonstrates the same list format.\n",
    "    \"\"\"\n",
    "\n",
    "    def extract_symptom_few_shot_prompt(self):\n",
    "        \"\"\"Return the few-shot prompt for extracting symptoms.\"\"\"\n",
    "        return '''你的任务是提取出文本中的症状，用列表的格式列出，若文本中无症状请输出Output:“NaN”。以下是一些例子：\n",
    "            Input: 腰疽愈后，肾伤未复，风邪易乘虚而袭，身热形寒，头脑胀痛眩晕，项背肩胛酸胀，脉浮而濡，舌干而燥。\n",
    "            Output: ['身热形寒', '头脑胀痛眩晕', '项背肩胛酸胀', '脉浮而濡', '舌干而燥']\n",
    "            Input:再以宣湿除陈，以醒胃气，调理尚须从缓也。\n",
    "            Output: NaN\n",
    "            Input: 燥屎已下，郁热下行，颧红、唇绛、舌赤均已退淡，芒刺亦软，惟午后稍觉烦躁，入夜犹欠清爽。\n",
    "            Output: ['惟午后稍觉烦躁', '入夜犹欠清爽']'''\n",
    "\n",
    "    def extract_pathogenesis_few_shot_prompt(self):\n",
    "        \"\"\"Return the few-shot prompt for extracting pathogenesis (causes of disease).\"\"\"\n",
    "        return '''你的任务是提取出文本中的病因，用列表的格式列出，若文本中无病因请输出Output:“NaN”。以下是一些例子：\n",
    "            Input:寒温失调，感受时行伤风，头眩胀痛，咳嗽发热，鼻塞多涕，欲作呕恶，两手脉俱反关，弦劲而滑。仍以轻宣，化风豁痰。\n",
    "            Output:['感受时行伤风']\n",
    "            Input:哕逆既平，热亦见退，大便已行，舌苔黄糙化薄，肌肤瘙痒，脉弦滑而濡。\n",
    "            Output:NaN\n",
    "            Input:劳顿感风，引动伏湿，肺胃二气相迫，发热壮盛，哕逆呃忒，连声紧促，夜眠不安，脉弦数。\n",
    "            Output:['劳顿感风，引动伏湿', '肺胃二气相迫']'''\n",
    "\n",
    "    def extract_treatment_principle_few_shot_prompt(self):\n",
    "        \"\"\"Return the few-shot prompt for extracting treatment principles.\"\"\"\n",
    "        return '''你的任务是提取出文本中的治则，用列表的格式列出，若文本中无治则治法请输出Output:“NaN”。以下是一些例子：\n",
    "            Input:寒温失调，感受时行伤风，头眩胀痛，咳嗽发热，鼻塞多涕，欲作呕恶，两手脉俱反关，弦劲而滑。仍以轻宣，化风豁痰。\n",
    "            Output:['轻宣', '化风豁痰']\n",
    "            Input:外因已却，形色清爽，惟脑后右边筋脉牵强拘急，应及腰脊，此则腰疽之后，肾伤未复。\n",
    "            Output:NaN\n",
    "            Input:风邪与痰滯相搏，寒热、咳嗽、头痛、呓语、若寐，亟以宣豁，防痰结生变。\n",
    "            Output:['宣豁']'''\n",
    "\n",
    "\n",
    "# Shared instance; prompt methods are looked up on it by name.\n",
    "extractor = ExtractEntityPrompt()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2f70d08-7f90-46df-bb24-1a10a421c716",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from zhipuai import ZhipuAI\n",
    "\n",
    "# Prefer the ZHIPUAI_API_KEY environment variable so a real key is never saved in the notebook;\n",
    "# the placeholder is only the fallback and must be replaced if the variable is not set.\n",
    "client = ZhipuAI(api_key=os.environ.get(\"ZHIPUAI_API_KEY\", \"zhipu_api_key\")) # Fill in your own APIKey\n",
    "entity_type = 'symptom' # Fill in the entity type to be extracted\n",
    "# Name of the prompt method looked up on `extractor` for this entity type.\n",
    "few_shot_prompt = f'extract_{entity_type}_few_shot_prompt'\n",
    "\n",
    "def extract(text):\n",
    "    \"\"\"Ask the chat model to extract entities of type `entity_type` from one case text.\n",
    "\n",
    "    The system message is the few-shot prompt named by `few_shot_prompt`; the user\n",
    "    message repeats the instruction for the configured entity type and appends `text`.\n",
    "\n",
    "    Args:\n",
    "        text (str): The medical case text; it is concatenated into the user message.\n",
    "\n",
    "    Returns:\n",
    "        The message object of the first choice in the response (read `.content` for the text).\n",
    "    \"\"\"\n",
    "    # Chinese entity name used in the instruction, so the user message always agrees\n",
    "    # with the system prompt selected by `entity_type`. Unknown types fall back to the raw name.\n",
    "    entity_labels = {'symptom': '症状', 'pathogenesis': '病因', 'treatment_principle': '治则'}\n",
    "    entity_label = entity_labels.get(entity_type, entity_type)\n",
    "    response = client.chat.completions.create(\n",
    "        model=\"glm-3-turbo\",  # Fill in the name of the model to be called\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": getattr(extractor, few_shot_prompt)()},\n",
    "            {\"role\": \"user\", \"content\": '''用给出的格式提取以下文本中的''' + entity_label + '''。\n",
    "            Input: ''' + text + '''\n",
    "            Output:'''}\n",
    "        ],\n",
    "    )\n",
    "    result = response.choices[0].message\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fec632aa-b43f-47fe-8aaa-9e73f1be647d",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = ['Original medical cases', entity_type]  # Adjust the columns in the output Excel here\n",
    "records = []  # One dict per processed case, written to Excel in a single call below\n",
    "\n",
    "try:\n",
    "    for n, text in enumerate(texts, start=1):\n",
    "        # Blank Excel cells are read as NaN; skip the model call for them.\n",
    "        entities = None if pd.isna(text) else extract(str(text)).content\n",
    "        # An Excel cell cannot hold a list, so a missing result is stored as the string 'NaN'.\n",
    "        fill_text = entities if entities else 'NaN'\n",
    "        records.append({columns[0]: text, columns[1]: fill_text})\n",
    "        print(f\"Article {n} Medical case entity - {entity_type} extraction completed\")\n",
    "finally:\n",
    "    # Write whatever was collected, even if the loop was interrupted, so finished rows are kept.\n",
    "    results = pd.DataFrame(records, columns=columns)\n",
    "    results.to_excel(output_file, sheet_name='Sheet1', index=False, engine='openpyxl')\n",
    "\n",
    "print(f\"Data saved to {output_file}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "QAM",
   "language": "python",
   "name": "qam"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
